Python 3.11.3 (v3.11.3:f3909b8bc8, Apr 4 2023, 20:12:10) [Clang 13.0.0 (clang-1300.0.29.30)]
Type 'copyright', 'credits' or 'license' for more information
IPython 8.26.0 -- An enhanced Interactive Python. Type '?' for help.

In [ ]:
for col in outlier_columns:
    dataset = mark_outliers_chauvenet(df, col)
    plot_binary_outliers(dataset=dataset, col=col, outlier_col=col + "_outlier", reset_index=True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
/Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py in line 1
----> <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=184'>185</a> for col in outlier_columns:
      <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=185'>186</a>     dataset = mark_outliers_chauvenet(df, col)
      <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=186'>187</a>     plot_binary_outliers(dataset=dataset, col=col, outlier_col=col + "_outlier", reset_index=True)

NameError: name 'outlier_columns' is not defined
In [ ]:
outlier_columns = list(df.columns[:6])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
/Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py in line 1
----> <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=13'>14</a> outlier_columns = list(df.columns[:6])

NameError: name 'df' is not defined
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy
from sklearn.neighbors import LocalOutlierFactor  # pip install scikit-learn
# --------------------------------------------------------------
# Load data
# --------------------------------------------------------------

df = pd.read_pickle("../../data/interim/processed_data_01.pkl")
outlier_columns = list(df.columns[:6])
# --------------------------------------------------------------
# Plotting outliers
# --------------------------------------------------------------
#print(plt.style.available)

plt.style.use("fivethirtyeight")
plt.rcParams["figure.figsize"] = (20, 5)
plt.rcParams["figure.dpi"] = 100
df[["acc_x", "label"]].boxplot(by = "label", figsize = (20, 10))
plt.show()
temp = ['acc_x', 'acc_y', 'acc_z']  
columns_to_plot = temp[:3] + ["label"]
df[columns_to_plot].boxplot(by="label", figsize=(20, 10), layout = (1, 3))
plt.show()
df[outlier_columns[3:] + ["label"]].boxplot(by = "label", figsize = (20, 10), layout = (1, 3))
plt.show()
def plot_binary_outliers(dataset, col, outlier_col, reset_index):
    """ Plot outliers in case of a binary outlier score. Here, the col specifies the real data
    column and outlier_col the columns with a binary value (outlier or not).
    Args:
        dataset (pd.DataFrame): The dataset
        col (string): Column that you want to plot
        outlier_col (string): Outlier column marked with true/false
        reset_index (bool): whether to reset the index for plotting
    """
    # Taken from: https://github.com/mhoogen/ML4QS/blob/master/Python3Code/util/VisualizeDataset.py
    dataset = dataset.dropna(axis=0, subset=[col, outlier_col])
    dataset[outlier_col] = dataset[outlier_col].astype("bool")
    if reset_index:
        dataset = dataset.reset_index()
    fig, ax = plt.subplots()
    plt.xlabel("samples")
    plt.ylabel("value")
    # Plot non outliers in default color
    ax.plot(
        dataset.index[~dataset[outlier_col]],
        dataset[col][~dataset[outlier_col]],
        "+",
    )
    # Plot data points that are outliers in red
    ax.plot(
        dataset.index[dataset[outlier_col]],
        dataset[col][dataset[outlier_col]],
        "r+",
    )
    plt.legend(
        ["outlier " + col, "no outlier " + col],
        loc="upper center",
        ncol=2,
        fancybox=True,
        shadow=True,
    )
    plt.show()
# --------------------------------------------------------------
# Interquartile range (distribution based)
# --------------------------------------------------------------
# Insert IQR function

def mark_outliers_iqr(dataset, col):
    """Function to mark values as outliers using the IQR method.
    Args:
        dataset (pd.DataFrame): The dataset
        col (string): The column you want apply outlier detection to
    Returns:
        pd.DataFrame: The original dataframe with an extra boolean column 
        indicating whether the value is an outlier or not.
    """
    dataset = dataset.copy()
    Q1 = dataset[col].quantile(0.25)
    Q3 = dataset[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    dataset[col + "_outlier"] = (dataset[col] < lower_bound) | (
        dataset[col] > upper_bound
    )
    return dataset
# Plot a single column

col = "acc_x"
dataset = mark_outliers_iqr(df, col)
plot_binary_outliers(dataset=dataset, col=col, outlier_col=col + "_outlier", reset_index=True)
# Loop over all columns

for col in outlier_columns:
    dataset = mark_outliers_iqr(df, col)
    plot_binary_outliers(dataset=dataset, col=col, outlier_col=col + "_outlier", reset_index=True)
# --------------------------------------------------------------
# Chauvenets criteron (distribution based)
# --------------------------------------------------------------
# Check for normal distribution

df[outlier_columns[:3] + ["label"]].plot.hist(by = "label", figsize = (20, 10), layout = (3, 3))
plt.show()
df[outlier_columns[3:] + ["label"]].plot.hist(by = "label", figsize = (20, 10), layout = (3, 3))
plt.show()
# Insert Chauvenet's function
# it assumes a normal distribution of the data

def mark_outliers_chauvenet(dataset, col, C=2):
    """Finds outliers in the specified column of datatable and adds a binary column with
    the same name extended with '_outlier' that expresses the result per data point.
    Taken from: https://github.com/mhoogen/ML4QS/blob/master/Python3Code/Chapter3/OutlierDetection.py
    Args:
        dataset (pd.DataFrame): The dataset
        col (string): The column you want apply outlier detection to
        C (int, optional): Degree of certainty for the identification of outliers given the assumption 
                           of a normal distribution, typicaly between 1 - 10. Defaults to 2.
    Returns:
        pd.DataFrame: The original dataframe with an extra boolean column 
        indicating whether the value is an outlier or not.
    """
    dataset = dataset.copy()
    # Compute the mean and standard deviation.
    mean = dataset[col].mean()
    std = dataset[col].std()
    N = len(dataset.index)
    criterion = 1.0 / (C * N)
    # Consider the deviation for the data points.
    deviation = abs(dataset[col] - mean) / std
    # Express the upper and lower bounds.
    low = -deviation / math.sqrt(C)
    high = deviation / math.sqrt(C)
    prob = []
    mask = []
    # Pass all rows in the dataset.
    for i in range(0, len(dataset.index)):
        # Determine the probability of observing the point
        prob.append(
            1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
        )
        # And mark as an outlier when the probability is below our criterion.
        mask.append(prob[i] < criterion)
    dataset[col + "_outlier"] = mask
    return dataset
In [ ]:
for col in outlier_columns:
    dataset = mark_outliers_chauvenet(df, col)
    plot_binary_outliers(dataset=dataset, col=col, outlier_col=col + "_outlier", reset_index=True)
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
In [ ]:
def mark_outliers_lof(dataset, columns, n=20):
    """Mark values as outliers using LOF
    Args:
        dataset (pd.DataFrame): The dataset
        col (string): The column you want apply outlier detection to
        n (int, optional): n_neighbors. Defaults to 20.
    Returns:
        pd.DataFrame: The original dataframe with an extra boolean column
        indicating whether the value is an outlier or not.
    """
    dataset = dataset.copy()
    lof = LocalOutlierFactor(n_neighbors=n)
    data = dataset[columns]
    outliers = lof.fit_predict(data)
    X_scores = lof.negative_outlier_factor_
    dataset["outlier_lof"] = outliers == -1
    return dataset, outliers, X_scores
In [ ]:
dataset, outliers, X_scores = mark_outliers_lof(df, outlier_columns)
In [ ]:
dataset
Out[ ]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z label category participant set outlier_lof
epoch (ms)
2019-01-11 15:08:05.200 0.013500 0.977000 -0.071000 -1.8904 2.4392 0.9388 bench heavy B 64 False
2019-01-11 15:08:05.400 -0.001500 0.970500 -0.079500 -1.6826 -0.8904 2.1708 bench heavy B 64 False
2019-01-11 15:08:05.600 0.001333 0.971667 -0.064333 2.5608 -0.2560 -1.4146 bench heavy B 64 False
2019-01-11 15:08:05.800 -0.024000 0.957000 -0.073500 8.0610 -4.5244 -2.0730 bench heavy B 64 False
2019-01-11 15:08:06.000 -0.028000 0.957667 -0.115000 2.4390 -1.5486 -3.6098 bench heavy B 64 False
... ... ... ... ... ... ... ... ... ... ... ...
2019-01-20 17:33:27.000 -0.048000 -1.041500 -0.076500 1.4146 -5.6218 0.2926 row medium E 71 False
2019-01-20 17:33:27.200 -0.037000 -1.030333 -0.053333 -2.7684 -0.5854 2.2440 row medium E 71 False
2019-01-20 17:33:27.400 -0.060000 -1.031000 -0.082000 2.8416 -5.1342 -0.1220 row medium E 71 False
2019-01-20 17:33:27.600 -0.038667 -1.025667 -0.044667 -0.2318 0.2562 1.1220 row medium E 71 False
2019-01-20 17:33:27.800 -0.044000 -1.034000 -0.059000 1.0980 -4.0240 0.9760 row medium E 71 False

9009 rows × 11 columns

In [ ]:
outliers
Out[ ]:
array([1, 1, 1, ..., 1, 1, 1])
In [ ]:
X_scores
Out[ ]:
array([-1.02773582, -1.07699558, -1.15029777, ..., -0.99433498,
       -0.98962988, -0.98354443])
In [ ]:
for col in outlier_columns:
    plot_binary_outliers(dataset=dataset, col=col, outlier_col="outlier_lof", reset_index=True)
In [ ]:
label = "bench"
for col in outlier_columns:
    dataset = mark_outliers_iqr(df[df["label"] == label], col)
    plot_binary_outliers(dataset, col, col + "_outlier", reset_index=True)
In [ ]:
label = "squat"
In [ ]:
for col in outlier_columns:
    dataset = mark_outliers_iqr(df[df["label"] == label], col)
    plot_binary_outliers(dataset, col, col + "_outlier", reset_index=True)
In [ ]:
label = "bench"
for col in outlier_columns:
    dataset = mark_outliers_chauvenet(df[df["label"] == label], col)
    plot_binary_outliers(dataset, col, col + "_outlier", reset_index=True)
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
In [ ]:
label = "bench"
In [ ]:
for col in outlier_columns:
    dataset = mark_outliers_chauvenet(df[df["label"] == label], col)
    plot_binary_outliers(dataset, col, col + "_outlier", reset_index=True)
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
In [ ]:
label = "bench"
In [ ]:
for col in outlier_columns:
    dataset = mark_outliers_lof(df[df["label"] == label], col)
    plot_binary_outliers(dataset, col, col + "_outlier", reset_index=True)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py in line 2
      <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=245'>246</a> for col in outlier_columns:
----> <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=246'>247</a>     dataset = mark_outliers_lof(df[df["label"] == label], col)
      <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=247'>248</a>     plot_binary_outliers(dataset, col, col + "_outlier", reset_index=True)

/Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py in line 14, in mark_outliers_lof(dataset, columns, n)
     <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=206'>207</a> lof = LocalOutlierFactor(n_neighbors=n)
     <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=207'>208</a> data = dataset[columns]
---> <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=208'>209</a> outliers = lof.fit_predict(data)
     <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=209'>210</a> X_scores = lof.negative_outlier_factor_
     <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=210'>211</a> dataset["outlier_lof"] = outliers == -1

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py:256, in LocalOutlierFactor.fit_predict(self, X, y)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=231'>232</a> """Fit the model to the training set X and return the labels.
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=232'>233</a> 
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=233'>234</a> **Not available for novelty detection (when novelty is set to True).**
   (...)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=249'>250</a>     Returns -1 for anomalies/outliers and 1 for inliers.
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=250'>251</a> """
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=252'>253</a> # As fit_predict would be different from fit.predict, fit_predict is
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=253'>254</a> # only available for outlier detection (novelty=False)
--> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=255'>256</a> return self.fit(X)._predict()

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=1465'>1466</a>     estimator._validate_params()
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=1467'>1468</a> with config_context(
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=1468'>1469</a>     skip_parameter_validation=(
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=1469'>1470</a>         prefer_skip_nested_validation or global_skip_validation
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=1470'>1471</a>     )
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=1471'>1472</a> ):
-> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=1472'>1473</a>     return fit_method(estimator, *args, **kwargs)

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py:279, in LocalOutlierFactor.fit(self, X, y)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=257'>258</a> @_fit_context(
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=258'>259</a>     # LocalOutlierFactor.metric is not validated yet
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=259'>260</a>     prefer_skip_nested_validation=False
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=260'>261</a> )
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=261'>262</a> def fit(self, X, y=None):
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=262'>263</a>     """Fit the local outlier factor detector from the training dataset.
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=263'>264</a> 
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=264'>265</a>     Parameters
   (...)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=276'>277</a>         The fitted local outlier factor detector.
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=277'>278</a>     """
--> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=278'>279</a>     self._fit(X)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=280'>281</a>     n_samples = self.n_samples_fit_
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_lof.py?line=281'>282</a>     if self.n_neighbors > n_samples:

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_base.py:517, in NeighborsBase._fit(self, X, y)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_base.py?line=514'>515</a> else:
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_base.py?line=515'>516</a>     if not isinstance(X, (KDTree, BallTree, NeighborsBase)):
--> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_base.py?line=516'>517</a>         X = self._validate_data(X, accept_sparse="csr", order="C")
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_base.py?line=518'>519</a> self._check_algorithm_metric()
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_base.py?line=519'>520</a> if self.metric_params is None:

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py:633, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=630'>631</a>         out = X, y
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=631'>632</a> elif not no_val_X and no_val_y:
--> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=632'>633</a>     out = check_array(X, input_name="X", **check_params)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=633'>634</a> elif no_val_X and not no_val_y:
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py?line=634'>635</a>     out = _check_y(y, **check_params)

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py:1050, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1042'>1043</a>         else:
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1043'>1044</a>             msg = (
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1044'>1045</a>                 f"Expected 2D array, got 1D array instead:\narray={array}.\n"
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1045'>1046</a>                 "Reshape your data either using array.reshape(-1, 1) if "
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1046'>1047</a>                 "your data has a single feature or array.reshape(1, -1) "
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1047'>1048</a>                 "if it contains a single sample."
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1048'>1049</a>             )
-> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1049'>1050</a>         raise ValueError(msg)
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1051'>1052</a> if dtype_numeric and hasattr(array.dtype, "kind") and array.dtype.kind in "USV":
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1052'>1053</a>     raise ValueError(
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1053'>1054</a>         "dtype='numeric' is not compatible with arrays of bytes/strings."
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1054'>1055</a>         "Convert your data to numeric values explicitly instead."
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/validation.py?line=1055'>1056</a>     )

ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.
In [ ]:
dataset, outliers, X_scores = mark_outliers_lof(df, outlier_columns)
In [ ]:
for col in outlier_columns:
    plot_binary_outliers(dataset=dataset, col=col, outlier_col="outlier_lof", reset_index=True)
In [ ]:
col = "gyr_z"
dataset = mark_outliers_chauvenet(df, col = col)
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
In [ ]:
dataset
Out[ ]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z label category participant set gyr_z_outlier
epoch (ms)
2019-01-11 15:08:05.200 0.013500 0.977000 -0.071000 -1.8904 2.4392 0.9388 bench heavy B 64 False
2019-01-11 15:08:05.400 -0.001500 0.970500 -0.079500 -1.6826 -0.8904 2.1708 bench heavy B 64 False
2019-01-11 15:08:05.600 0.001333 0.971667 -0.064333 2.5608 -0.2560 -1.4146 bench heavy B 64 False
2019-01-11 15:08:05.800 -0.024000 0.957000 -0.073500 8.0610 -4.5244 -2.0730 bench heavy B 64 False
2019-01-11 15:08:06.000 -0.028000 0.957667 -0.115000 2.4390 -1.5486 -3.6098 bench heavy B 64 False
... ... ... ... ... ... ... ... ... ... ... ...
2019-01-20 17:33:27.000 -0.048000 -1.041500 -0.076500 1.4146 -5.6218 0.2926 row medium E 71 False
2019-01-20 17:33:27.200 -0.037000 -1.030333 -0.053333 -2.7684 -0.5854 2.2440 row medium E 71 False
2019-01-20 17:33:27.400 -0.060000 -1.031000 -0.082000 2.8416 -5.1342 -0.1220 row medium E 71 False
2019-01-20 17:33:27.600 -0.038667 -1.025667 -0.044667 -0.2318 0.2562 1.1220 row medium E 71 False
2019-01-20 17:33:27.800 -0.044000 -1.034000 -0.059000 1.0980 -4.0240 0.9760 row medium E 71 False

9009 rows × 11 columns

In [ ]:
dataset[dataset["gyr_z_outlier"]]
Out[ ]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z label category participant set gyr_z_outlier
epoch (ms)
2019-01-14 13:57:41.800 -0.137000 1.495500 0.217000 33.9146 14.6462 -98.8294 ohp heavy C 34 True
2019-01-18 17:22:40.600 0.915500 -0.302500 -0.047500 16.5364 30.1952 119.8050 rest sitting A 6 True
2019-01-18 17:22:40.800 0.846667 -0.645667 0.174333 -49.5610 63.7196 104.2684 rest sitting A 6 True
2019-01-18 17:22:49.600 1.363667 0.030000 -0.001333 -45.7196 -8.1218 -168.9514 rest sitting A 6 True
2019-01-18 17:22:49.800 0.707500 0.390500 0.181500 -100.5976 -33.9634 -177.6098 rest sitting A 6 True
2019-01-18 17:22:52.000 0.371333 0.580333 0.323333 44.3416 75.4390 132.6100 rest sitting A 6 True
2019-01-18 17:22:52.200 0.744500 0.272000 0.261500 23.2562 22.4388 196.3294 rest sitting A 6 True
2019-01-18 17:22:59.600 0.800000 -0.406333 0.121667 -35.7436 28.3414 127.8780 rest sitting A 6 True
2019-01-18 17:25:40.400 0.623500 -0.247500 0.226000 -5.7562 55.8294 175.6464 rest standing A 36 True
2019-01-18 17:25:40.600 0.669333 -0.957000 0.440667 -48.6584 85.9266 178.5730 rest standing A 36 True
2019-01-18 17:26:04.800 1.115000 -0.792500 0.809000 91.5732 21.1710 -209.7562 rest standing A 36 True
2019-01-18 17:26:05.000 0.714000 -0.101000 0.646000 98.5000 127.2804 -209.8538 rest standing A 36 True
2019-01-18 17:26:07.400 -0.217667 0.465667 0.320333 -108.5002 -117.7196 167.3170 rest standing A 36 True
2019-01-18 17:26:07.600 0.616000 -0.205500 0.625000 -207.6098 -96.8414 269.0854 rest standing A 36 True
2019-01-18 17:26:07.800 1.070333 -1.238333 0.751000 -79.0244 -103.5126 179.7196 rest standing A 36 True
2019-01-18 17:26:09.400 0.773667 -1.072000 0.310000 -8.6950 -61.7318 -162.9390 rest standing A 36 True
2019-01-18 17:26:09.600 1.464000 -0.902000 0.083000 -128.5364 -75.0488 -338.1708 rest standing A 36 True
2019-01-18 17:26:09.800 0.709333 -0.014667 -0.120667 -235.5244 -187.9512 -169.1826 rest standing A 36 True
2019-01-18 17:26:11.600 0.419000 0.175500 -0.176500 171.8048 103.7316 267.3416 rest standing A 36 True
2019-01-18 17:26:11.800 1.155333 -0.807667 -0.160333 -81.4878 233.1832 248.3416 rest standing A 36 True
2019-01-19 17:22:40.600 0.915500 -0.302500 -0.047500 16.5364 30.1952 119.8050 rest sitting A 62 True
2019-01-19 17:22:40.800 0.846667 -0.645667 0.174333 -49.5610 63.7196 104.2684 rest sitting A 62 True
2019-01-19 17:22:49.600 1.363667 0.030000 -0.001333 -45.7196 -8.1218 -168.9514 rest sitting A 62 True
2019-01-19 17:22:49.800 0.707500 0.390500 0.181500 -100.5976 -33.9634 -177.6098 rest sitting A 62 True
2019-01-19 17:22:52.000 0.371333 0.580333 0.323333 44.3416 75.4390 132.6100 rest sitting A 62 True
2019-01-19 17:22:52.200 0.744500 0.272000 0.261500 23.2562 22.4388 196.3294 rest sitting A 62 True
2019-01-19 17:22:59.600 0.800000 -0.406333 0.121667 -35.7436 28.3414 127.8780 rest sitting A 62 True
2019-01-19 17:25:40.400 0.623500 -0.247500 0.226000 -5.7562 55.8294 175.6464 rest standing A 68 True
2019-01-19 17:25:40.600 0.669333 -0.957000 0.440667 -48.6584 85.9266 178.5730 rest standing A 68 True
2019-01-19 17:26:04.800 1.115000 -0.792500 0.809000 91.5732 21.1710 -209.7562 rest standing A 68 True
2019-01-19 17:26:05.000 0.714000 -0.101000 0.646000 98.5000 127.2804 -209.8538 rest standing A 68 True
2019-01-19 17:26:07.400 -0.217667 0.465667 0.320333 -108.5002 -117.7196 167.3170 rest standing A 68 True
2019-01-19 17:26:07.600 0.616000 -0.205500 0.625000 -207.6098 -96.8414 269.0854 rest standing A 68 True
2019-01-19 17:26:07.800 1.070333 -1.238333 0.751000 -79.0244 -103.5126 179.7196 rest standing A 68 True
2019-01-19 17:26:09.400 0.773667 -1.072000 0.310000 -8.6950 -61.7318 -162.9390 rest standing A 68 True
2019-01-19 17:26:09.600 1.464000 -0.902000 0.083000 -128.5364 -75.0488 -338.1708 rest standing A 68 True
2019-01-19 17:26:09.800 0.709333 -0.014667 -0.120667 -235.5244 -187.9512 -169.1826 rest standing A 68 True
2019-01-19 17:26:11.600 0.419000 0.175500 -0.176500 171.8048 103.7316 267.3416 rest standing A 68 True
2019-01-19 17:26:11.800 1.155333 -0.807667 -0.160333 -81.4878 233.1832 248.3416 rest standing A 68 True
2019-01-20 17:22:40.600 0.915500 -0.302500 -0.047500 16.5364 30.1952 119.8050 rest sitting E 54 True
2019-01-20 17:22:40.800 0.846667 -0.645667 0.174333 -49.5610 63.7196 104.2684 rest sitting E 54 True
2019-01-20 17:22:49.600 1.363667 0.030000 -0.001333 -45.7196 -8.1218 -168.9514 rest sitting E 54 True
2019-01-20 17:22:49.800 0.707500 0.390500 0.181500 -100.5976 -33.9634 -177.6098 rest sitting E 54 True
2019-01-20 17:22:52.000 0.371333 0.580333 0.323333 44.3416 75.4390 132.6100 rest sitting E 54 True
2019-01-20 17:22:52.200 0.744500 0.272000 0.261500 23.2562 22.4388 196.3294 rest sitting E 54 True
2019-01-20 17:22:59.600 0.800000 -0.406333 0.121667 -35.7436 28.3414 127.8780 rest sitting E 54 True
2019-01-20 17:25:40.400 0.623500 -0.247500 0.226000 -5.7562 55.8294 175.6464 rest standing E 44 True
2019-01-20 17:25:40.600 0.669333 -0.957000 0.440667 -48.6584 85.9266 178.5730 rest standing E 44 True
2019-01-20 17:26:04.800 1.115000 -0.792500 0.809000 91.5732 21.1710 -209.7562 rest standing E 44 True
2019-01-20 17:26:05.000 0.714000 -0.101000 0.646000 98.5000 127.2804 -209.8538 rest standing E 44 True
2019-01-20 17:26:07.400 -0.217667 0.465667 0.320333 -108.5002 -117.7196 167.3170 rest standing E 44 True
2019-01-20 17:26:07.600 0.616000 -0.205500 0.625000 -207.6098 -96.8414 269.0854 rest standing E 44 True
2019-01-20 17:26:07.800 1.070333 -1.238333 0.751000 -79.0244 -103.5126 179.7196 rest standing E 44 True
2019-01-20 17:26:09.400 0.773667 -1.072000 0.310000 -8.6950 -61.7318 -162.9390 rest standing E 44 True
2019-01-20 17:26:09.600 1.464000 -0.902000 0.083000 -128.5364 -75.0488 -338.1708 rest standing E 44 True
2019-01-20 17:26:09.800 0.709333 -0.014667 -0.120667 -235.5244 -187.9512 -169.1826 rest standing E 44 True
2019-01-20 17:26:11.600 0.419000 0.175500 -0.176500 171.8048 103.7316 267.3416 rest standing E 44 True
2019-01-20 17:26:11.800 1.155333 -0.807667 -0.160333 -81.4878 233.1832 248.3416 rest standing E 44 True
In [ ]:
dataset.loc[dataset["gyr_z_outlier"], "gyr_z"] = np.nan
In [ ]:
dataset
Out[ ]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z label category participant set gyr_z_outlier
epoch (ms)
2019-01-11 15:08:05.200 0.013500 0.977000 -0.071000 -1.8904 2.4392 0.9388 bench heavy B 64 False
2019-01-11 15:08:05.400 -0.001500 0.970500 -0.079500 -1.6826 -0.8904 2.1708 bench heavy B 64 False
2019-01-11 15:08:05.600 0.001333 0.971667 -0.064333 2.5608 -0.2560 -1.4146 bench heavy B 64 False
2019-01-11 15:08:05.800 -0.024000 0.957000 -0.073500 8.0610 -4.5244 -2.0730 bench heavy B 64 False
2019-01-11 15:08:06.000 -0.028000 0.957667 -0.115000 2.4390 -1.5486 -3.6098 bench heavy B 64 False
... ... ... ... ... ... ... ... ... ... ... ...
2019-01-20 17:33:27.000 -0.048000 -1.041500 -0.076500 1.4146 -5.6218 0.2926 row medium E 71 False
2019-01-20 17:33:27.200 -0.037000 -1.030333 -0.053333 -2.7684 -0.5854 2.2440 row medium E 71 False
2019-01-20 17:33:27.400 -0.060000 -1.031000 -0.082000 2.8416 -5.1342 -0.1220 row medium E 71 False
2019-01-20 17:33:27.600 -0.038667 -1.025667 -0.044667 -0.2318 0.2562 1.1220 row medium E 71 False
2019-01-20 17:33:27.800 -0.044000 -1.034000 -0.059000 1.0980 -4.0240 0.9760 row medium E 71 False

9009 rows × 11 columns

In [ ]:
dataset[dataset["gyr_z_outlier"]] # outliers in dataset
Out[ ]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z label category participant set gyr_z_outlier
epoch (ms)
2019-01-14 13:57:41.800 -0.137000 1.495500 0.217000 33.9146 14.6462 NaN ohp heavy C 34 True
2019-01-18 17:22:40.600 0.915500 -0.302500 -0.047500 16.5364 30.1952 NaN rest sitting A 6 True
2019-01-18 17:22:40.800 0.846667 -0.645667 0.174333 -49.5610 63.7196 NaN rest sitting A 6 True
2019-01-18 17:22:49.600 1.363667 0.030000 -0.001333 -45.7196 -8.1218 NaN rest sitting A 6 True
2019-01-18 17:22:49.800 0.707500 0.390500 0.181500 -100.5976 -33.9634 NaN rest sitting A 6 True
2019-01-18 17:22:52.000 0.371333 0.580333 0.323333 44.3416 75.4390 NaN rest sitting A 6 True
2019-01-18 17:22:52.200 0.744500 0.272000 0.261500 23.2562 22.4388 NaN rest sitting A 6 True
2019-01-18 17:22:59.600 0.800000 -0.406333 0.121667 -35.7436 28.3414 NaN rest sitting A 6 True
2019-01-18 17:25:40.400 0.623500 -0.247500 0.226000 -5.7562 55.8294 NaN rest standing A 36 True
2019-01-18 17:25:40.600 0.669333 -0.957000 0.440667 -48.6584 85.9266 NaN rest standing A 36 True
2019-01-18 17:26:04.800 1.115000 -0.792500 0.809000 91.5732 21.1710 NaN rest standing A 36 True
2019-01-18 17:26:05.000 0.714000 -0.101000 0.646000 98.5000 127.2804 NaN rest standing A 36 True
2019-01-18 17:26:07.400 -0.217667 0.465667 0.320333 -108.5002 -117.7196 NaN rest standing A 36 True
2019-01-18 17:26:07.600 0.616000 -0.205500 0.625000 -207.6098 -96.8414 NaN rest standing A 36 True
2019-01-18 17:26:07.800 1.070333 -1.238333 0.751000 -79.0244 -103.5126 NaN rest standing A 36 True
2019-01-18 17:26:09.400 0.773667 -1.072000 0.310000 -8.6950 -61.7318 NaN rest standing A 36 True
2019-01-18 17:26:09.600 1.464000 -0.902000 0.083000 -128.5364 -75.0488 NaN rest standing A 36 True
2019-01-18 17:26:09.800 0.709333 -0.014667 -0.120667 -235.5244 -187.9512 NaN rest standing A 36 True
2019-01-18 17:26:11.600 0.419000 0.175500 -0.176500 171.8048 103.7316 NaN rest standing A 36 True
2019-01-18 17:26:11.800 1.155333 -0.807667 -0.160333 -81.4878 233.1832 NaN rest standing A 36 True
2019-01-19 17:22:40.600 0.915500 -0.302500 -0.047500 16.5364 30.1952 NaN rest sitting A 62 True
2019-01-19 17:22:40.800 0.846667 -0.645667 0.174333 -49.5610 63.7196 NaN rest sitting A 62 True
2019-01-19 17:22:49.600 1.363667 0.030000 -0.001333 -45.7196 -8.1218 NaN rest sitting A 62 True
2019-01-19 17:22:49.800 0.707500 0.390500 0.181500 -100.5976 -33.9634 NaN rest sitting A 62 True
2019-01-19 17:22:52.000 0.371333 0.580333 0.323333 44.3416 75.4390 NaN rest sitting A 62 True
2019-01-19 17:22:52.200 0.744500 0.272000 0.261500 23.2562 22.4388 NaN rest sitting A 62 True
2019-01-19 17:22:59.600 0.800000 -0.406333 0.121667 -35.7436 28.3414 NaN rest sitting A 62 True
2019-01-19 17:25:40.400 0.623500 -0.247500 0.226000 -5.7562 55.8294 NaN rest standing A 68 True
2019-01-19 17:25:40.600 0.669333 -0.957000 0.440667 -48.6584 85.9266 NaN rest standing A 68 True
2019-01-19 17:26:04.800 1.115000 -0.792500 0.809000 91.5732 21.1710 NaN rest standing A 68 True
2019-01-19 17:26:05.000 0.714000 -0.101000 0.646000 98.5000 127.2804 NaN rest standing A 68 True
2019-01-19 17:26:07.400 -0.217667 0.465667 0.320333 -108.5002 -117.7196 NaN rest standing A 68 True
2019-01-19 17:26:07.600 0.616000 -0.205500 0.625000 -207.6098 -96.8414 NaN rest standing A 68 True
2019-01-19 17:26:07.800 1.070333 -1.238333 0.751000 -79.0244 -103.5126 NaN rest standing A 68 True
2019-01-19 17:26:09.400 0.773667 -1.072000 0.310000 -8.6950 -61.7318 NaN rest standing A 68 True
2019-01-19 17:26:09.600 1.464000 -0.902000 0.083000 -128.5364 -75.0488 NaN rest standing A 68 True
2019-01-19 17:26:09.800 0.709333 -0.014667 -0.120667 -235.5244 -187.9512 NaN rest standing A 68 True
2019-01-19 17:26:11.600 0.419000 0.175500 -0.176500 171.8048 103.7316 NaN rest standing A 68 True
2019-01-19 17:26:11.800 1.155333 -0.807667 -0.160333 -81.4878 233.1832 NaN rest standing A 68 True
2019-01-20 17:22:40.600 0.915500 -0.302500 -0.047500 16.5364 30.1952 NaN rest sitting E 54 True
2019-01-20 17:22:40.800 0.846667 -0.645667 0.174333 -49.5610 63.7196 NaN rest sitting E 54 True
2019-01-20 17:22:49.600 1.363667 0.030000 -0.001333 -45.7196 -8.1218 NaN rest sitting E 54 True
2019-01-20 17:22:49.800 0.707500 0.390500 0.181500 -100.5976 -33.9634 NaN rest sitting E 54 True
2019-01-20 17:22:52.000 0.371333 0.580333 0.323333 44.3416 75.4390 NaN rest sitting E 54 True
2019-01-20 17:22:52.200 0.744500 0.272000 0.261500 23.2562 22.4388 NaN rest sitting E 54 True
2019-01-20 17:22:59.600 0.800000 -0.406333 0.121667 -35.7436 28.3414 NaN rest sitting E 54 True
2019-01-20 17:25:40.400 0.623500 -0.247500 0.226000 -5.7562 55.8294 NaN rest standing E 44 True
2019-01-20 17:25:40.600 0.669333 -0.957000 0.440667 -48.6584 85.9266 NaN rest standing E 44 True
2019-01-20 17:26:04.800 1.115000 -0.792500 0.809000 91.5732 21.1710 NaN rest standing E 44 True
2019-01-20 17:26:05.000 0.714000 -0.101000 0.646000 98.5000 127.2804 NaN rest standing E 44 True
2019-01-20 17:26:07.400 -0.217667 0.465667 0.320333 -108.5002 -117.7196 NaN rest standing E 44 True
2019-01-20 17:26:07.600 0.616000 -0.205500 0.625000 -207.6098 -96.8414 NaN rest standing E 44 True
2019-01-20 17:26:07.800 1.070333 -1.238333 0.751000 -79.0244 -103.5126 NaN rest standing E 44 True
2019-01-20 17:26:09.400 0.773667 -1.072000 0.310000 -8.6950 -61.7318 NaN rest standing E 44 True
2019-01-20 17:26:09.600 1.464000 -0.902000 0.083000 -128.5364 -75.0488 NaN rest standing E 44 True
2019-01-20 17:26:09.800 0.709333 -0.014667 -0.120667 -235.5244 -187.9512 NaN rest standing E 44 True
2019-01-20 17:26:11.600 0.419000 0.175500 -0.176500 171.8048 103.7316 NaN rest standing E 44 True
2019-01-20 17:26:11.800 1.155333 -0.807667 -0.160333 -81.4878 233.1832 NaN rest standing E 44 True
In [ ]:
dataset.loc[dataset["gyr_z_outlier"], "gyr_z"] = np.nan
In [ ]:
dataset[dataset["gyr_z_outlier"]] # outliers in dataset
Out[ ]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z label category participant set gyr_z_outlier
epoch (ms)
2019-01-14 13:57:41.800 -0.137000 1.495500 0.217000 33.9146 14.6462 NaN ohp heavy C 34 True
2019-01-18 17:22:40.600 0.915500 -0.302500 -0.047500 16.5364 30.1952 NaN rest sitting A 6 True
2019-01-18 17:22:40.800 0.846667 -0.645667 0.174333 -49.5610 63.7196 NaN rest sitting A 6 True
2019-01-18 17:22:49.600 1.363667 0.030000 -0.001333 -45.7196 -8.1218 NaN rest sitting A 6 True
2019-01-18 17:22:49.800 0.707500 0.390500 0.181500 -100.5976 -33.9634 NaN rest sitting A 6 True
2019-01-18 17:22:52.000 0.371333 0.580333 0.323333 44.3416 75.4390 NaN rest sitting A 6 True
2019-01-18 17:22:52.200 0.744500 0.272000 0.261500 23.2562 22.4388 NaN rest sitting A 6 True
2019-01-18 17:22:59.600 0.800000 -0.406333 0.121667 -35.7436 28.3414 NaN rest sitting A 6 True
2019-01-18 17:25:40.400 0.623500 -0.247500 0.226000 -5.7562 55.8294 NaN rest standing A 36 True
2019-01-18 17:25:40.600 0.669333 -0.957000 0.440667 -48.6584 85.9266 NaN rest standing A 36 True
2019-01-18 17:26:04.800 1.115000 -0.792500 0.809000 91.5732 21.1710 NaN rest standing A 36 True
2019-01-18 17:26:05.000 0.714000 -0.101000 0.646000 98.5000 127.2804 NaN rest standing A 36 True
2019-01-18 17:26:07.400 -0.217667 0.465667 0.320333 -108.5002 -117.7196 NaN rest standing A 36 True
2019-01-18 17:26:07.600 0.616000 -0.205500 0.625000 -207.6098 -96.8414 NaN rest standing A 36 True
2019-01-18 17:26:07.800 1.070333 -1.238333 0.751000 -79.0244 -103.5126 NaN rest standing A 36 True
2019-01-18 17:26:09.400 0.773667 -1.072000 0.310000 -8.6950 -61.7318 NaN rest standing A 36 True
2019-01-18 17:26:09.600 1.464000 -0.902000 0.083000 -128.5364 -75.0488 NaN rest standing A 36 True
2019-01-18 17:26:09.800 0.709333 -0.014667 -0.120667 -235.5244 -187.9512 NaN rest standing A 36 True
2019-01-18 17:26:11.600 0.419000 0.175500 -0.176500 171.8048 103.7316 NaN rest standing A 36 True
2019-01-18 17:26:11.800 1.155333 -0.807667 -0.160333 -81.4878 233.1832 NaN rest standing A 36 True
2019-01-19 17:22:40.600 0.915500 -0.302500 -0.047500 16.5364 30.1952 NaN rest sitting A 62 True
2019-01-19 17:22:40.800 0.846667 -0.645667 0.174333 -49.5610 63.7196 NaN rest sitting A 62 True
2019-01-19 17:22:49.600 1.363667 0.030000 -0.001333 -45.7196 -8.1218 NaN rest sitting A 62 True
2019-01-19 17:22:49.800 0.707500 0.390500 0.181500 -100.5976 -33.9634 NaN rest sitting A 62 True
2019-01-19 17:22:52.000 0.371333 0.580333 0.323333 44.3416 75.4390 NaN rest sitting A 62 True
2019-01-19 17:22:52.200 0.744500 0.272000 0.261500 23.2562 22.4388 NaN rest sitting A 62 True
2019-01-19 17:22:59.600 0.800000 -0.406333 0.121667 -35.7436 28.3414 NaN rest sitting A 62 True
2019-01-19 17:25:40.400 0.623500 -0.247500 0.226000 -5.7562 55.8294 NaN rest standing A 68 True
2019-01-19 17:25:40.600 0.669333 -0.957000 0.440667 -48.6584 85.9266 NaN rest standing A 68 True
2019-01-19 17:26:04.800 1.115000 -0.792500 0.809000 91.5732 21.1710 NaN rest standing A 68 True
2019-01-19 17:26:05.000 0.714000 -0.101000 0.646000 98.5000 127.2804 NaN rest standing A 68 True
2019-01-19 17:26:07.400 -0.217667 0.465667 0.320333 -108.5002 -117.7196 NaN rest standing A 68 True
2019-01-19 17:26:07.600 0.616000 -0.205500 0.625000 -207.6098 -96.8414 NaN rest standing A 68 True
2019-01-19 17:26:07.800 1.070333 -1.238333 0.751000 -79.0244 -103.5126 NaN rest standing A 68 True
2019-01-19 17:26:09.400 0.773667 -1.072000 0.310000 -8.6950 -61.7318 NaN rest standing A 68 True
2019-01-19 17:26:09.600 1.464000 -0.902000 0.083000 -128.5364 -75.0488 NaN rest standing A 68 True
2019-01-19 17:26:09.800 0.709333 -0.014667 -0.120667 -235.5244 -187.9512 NaN rest standing A 68 True
2019-01-19 17:26:11.600 0.419000 0.175500 -0.176500 171.8048 103.7316 NaN rest standing A 68 True
2019-01-19 17:26:11.800 1.155333 -0.807667 -0.160333 -81.4878 233.1832 NaN rest standing A 68 True
2019-01-20 17:22:40.600 0.915500 -0.302500 -0.047500 16.5364 30.1952 NaN rest sitting E 54 True
2019-01-20 17:22:40.800 0.846667 -0.645667 0.174333 -49.5610 63.7196 NaN rest sitting E 54 True
2019-01-20 17:22:49.600 1.363667 0.030000 -0.001333 -45.7196 -8.1218 NaN rest sitting E 54 True
2019-01-20 17:22:49.800 0.707500 0.390500 0.181500 -100.5976 -33.9634 NaN rest sitting E 54 True
2019-01-20 17:22:52.000 0.371333 0.580333 0.323333 44.3416 75.4390 NaN rest sitting E 54 True
2019-01-20 17:22:52.200 0.744500 0.272000 0.261500 23.2562 22.4388 NaN rest sitting E 54 True
2019-01-20 17:22:59.600 0.800000 -0.406333 0.121667 -35.7436 28.3414 NaN rest sitting E 54 True
2019-01-20 17:25:40.400 0.623500 -0.247500 0.226000 -5.7562 55.8294 NaN rest standing E 44 True
2019-01-20 17:25:40.600 0.669333 -0.957000 0.440667 -48.6584 85.9266 NaN rest standing E 44 True
2019-01-20 17:26:04.800 1.115000 -0.792500 0.809000 91.5732 21.1710 NaN rest standing E 44 True
2019-01-20 17:26:05.000 0.714000 -0.101000 0.646000 98.5000 127.2804 NaN rest standing E 44 True
2019-01-20 17:26:07.400 -0.217667 0.465667 0.320333 -108.5002 -117.7196 NaN rest standing E 44 True
2019-01-20 17:26:07.600 0.616000 -0.205500 0.625000 -207.6098 -96.8414 NaN rest standing E 44 True
2019-01-20 17:26:07.800 1.070333 -1.238333 0.751000 -79.0244 -103.5126 NaN rest standing E 44 True
2019-01-20 17:26:09.400 0.773667 -1.072000 0.310000 -8.6950 -61.7318 NaN rest standing E 44 True
2019-01-20 17:26:09.600 1.464000 -0.902000 0.083000 -128.5364 -75.0488 NaN rest standing E 44 True
2019-01-20 17:26:09.800 0.709333 -0.014667 -0.120667 -235.5244 -187.9512 NaN rest standing E 44 True
2019-01-20 17:26:11.600 0.419000 0.175500 -0.176500 171.8048 103.7316 NaN rest standing E 44 True
2019-01-20 17:26:11.800 1.155333 -0.807667 -0.160333 -81.4878 233.1832 NaN rest standing E 44 True
In [ ]:
removed_outliers_df = df.copy()
In [ ]:
for col in outlier_columns:
    for label in df["label"].unique():
        dataset = mark_outliers_chauvenet(df[df["label"] == label], col)
        # replace outliers with nan
        dataset.loc[dataset[col + "_outlier"], col] = np.nan
        # update column in the original dataset
        removed_outliers_df.loc[(removed_outliers_df["label"] == label), col] = dataset[col]
        # count how many outliers were removed
        outliers_no = len(df) - len(removed_outliers_df[col].dropna())
        print(f"Removed {outliers_no} from {col} for {label}")
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
Removed 0 from acc_x for bench
Removed 2 from acc_x for ohp
Removed 2 from acc_x for squat
Removed 4 from acc_x for dead
Removed 4 from acc_x for row
Removed 4 from acc_x for rest
Removed 5 from acc_y for bench
Removed 11 from acc_y for ohp
Removed 11 from acc_y for squat
Removed 11 from acc_y for dead
Removed 11 from acc_y for row
Removed 11 from acc_y for rest
Removed 3 from acc_z for bench
Removed 9 from acc_z for ohp
Removed 9 from acc_z for squat
Removed 10 from acc_z for dead
Removed 10 from acc_z for row
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
Removed 10 from acc_z for rest
Removed 2 from gyr_x for bench
Removed 6 from gyr_x for ohp
Removed 7 from gyr_x for squat
Removed 13 from gyr_x for dead
Removed 13 from gyr_x for row
Removed 25 from gyr_x for rest
Removed 14 from gyr_y for bench
Removed 29 from gyr_y for ohp
Removed 38 from gyr_y for squat
Removed 52 from gyr_y for dead
Removed 62 from gyr_y for row
Removed 71 from gyr_y for rest
Removed 13 from gyr_z for bench
Removed 14 from gyr_z for ohp
Removed 26 from gyr_z for squat
Removed 40 from gyr_z for dead
Removed 40 from gyr_z for row
Removed 64 from gyr_z for rest
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
In [ ]:
print(f"Removed {outliers_no} from {col} for {label}")
Removed 64 from gyr_z for rest
In [ ]:
for col in outlier_columns:
    for label in df["label"].unique():
        dataset = mark_outliers_chauvenet(df[df["label"] == label], col)
        # replace outliers with nan
        dataset.loc[dataset[col + "_outlier"], col] = np.nan
        # update column in the original dataset
        removed_outliers_df.loc[(removed_outliers_df["label"] == label), col] = dataset[col]
        # count how many outliers were removed
        outliers_no = len(df) - len(removed_outliers_df[col].dropna())
        print(f"Removed {outliers_no} from {col} for {label}")
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
Removed 4 from acc_x for bench
Removed 4 from acc_x for ohp
Removed 4 from acc_x for squat
Removed 4 from acc_x for dead
Removed 4 from acc_x for row
Removed 4 from acc_x for rest
Removed 11 from acc_y for bench
Removed 11 from acc_y for ohp
Removed 11 from acc_y for squat
Removed 11 from acc_y for dead
Removed 11 from acc_y for row
Removed 11 from acc_y for rest
Removed 10 from acc_z for bench
Removed 10 from acc_z for ohp
Removed 10 from acc_z for squat
Removed 10 from acc_z for dead
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
Removed 10 from acc_z for row
Removed 10 from acc_z for rest
Removed 25 from gyr_x for bench
Removed 25 from gyr_x for ohp
Removed 25 from gyr_x for squat
Removed 25 from gyr_x for dead
Removed 25 from gyr_x for row
Removed 25 from gyr_x for rest
Removed 71 from gyr_y for bench
Removed 71 from gyr_y for ohp
Removed 71 from gyr_y for squat
Removed 71 from gyr_y for dead
Removed 71 from gyr_y for row
Removed 71 from gyr_y for rest
Removed 64 from gyr_z for bench
Removed 64 from gyr_z for ohp
Removed 64 from gyr_z for squat
Removed 64 from gyr_z for dead
Removed 64 from gyr_z for row
Removed 64 from gyr_z for rest
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
In [ ]:
removed_outliers_df = df.copy()
for col in outlier_columns:
    for label in df["label"].unique():
        dataset = mark_outliers_chauvenet(df[df["label"] == label], col)
        # replace outliers with nan
        dataset.loc[dataset[col + "_outlier"], col] = np.nan
        # update column in the original dataset
        removed_outliers_df.loc[(removed_outliers_df["label"] == label), col] = dataset[col]
        # count how many outliers were removed
        outliers_no = len(dataset) - len(removed_outliers_df[col].dropna())
        print(f"Removed {outliers_no} from {col} for {label}")
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
Removed -7344 from acc_x for bench
Removed -7331 from acc_x for ohp
Removed -7397 from acc_x for squat
Removed -7474 from acc_x for dead
Removed -7588 from acc_x for row
Removed -7895 from acc_x for rest
Removed -7339 from acc_y for bench
Removed -7322 from acc_y for ohp
Removed -7388 from acc_y for squat
Removed -7467 from acc_y for dead
Removed -7581 from acc_y for row
Removed -7888 from acc_y for rest
Removed -7341 from acc_z for bench
Removed -7324 from acc_z for ohp
Removed -7390 from acc_z for squat
Removed -7468 from acc_z for dead
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
Removed -7582 from acc_z for row
Removed -7889 from acc_z for rest
Removed -7342 from gyr_x for bench
Removed -7327 from gyr_x for ohp
Removed -7392 from gyr_x for squat
Removed -7465 from gyr_x for dead
Removed -7579 from gyr_x for row
Removed -7874 from gyr_x for rest
Removed -7330 from gyr_y for bench
Removed -7304 from gyr_y for ohp
Removed -7361 from gyr_y for squat
Removed -7426 from gyr_y for dead
Removed -7530 from gyr_y for row
Removed -7828 from gyr_y for rest
Removed -7331 from gyr_z for bench
Removed -7319 from gyr_z for ohp
Removed -7373 from gyr_z for squat
Removed -7438 from gyr_z for dead
Removed -7552 from gyr_z for row
Removed -7835 from gyr_z for rest
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
In [ ]:
for col in outlier_columns:
    for label in df["label"].unique():
        dataset = mark_outliers_chauvenet(df[df["label"] == label], col)
        # replace outliers with nan
        dataset.loc[dataset[col + "_outlier"], col] = np.nan
        # update column in the original dataset
        removed_outliers_df.loc[(removed_outliers_df["label"] == label), col] = dataset[col]
        # count how many outliers were removed
        outliers_no = len(dataset) - len(dataset[col].dropna())
        print(f"Removed {outliers_no} from {col} for {label}")
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
Removed 0 from acc_x for bench
Removed 2 from acc_x for ohp
Removed 0 from acc_x for squat
Removed 2 from acc_x for dead
Removed 0 from acc_x for row
Removed 0 from acc_x for rest
Removed 5 from acc_y for bench
Removed 6 from acc_y for ohp
Removed 0 from acc_y for squat
Removed 0 from acc_y for dead
Removed 0 from acc_y for row
Removed 0 from acc_y for rest
Removed 3 from acc_z for bench
Removed 6 from acc_z for ohp
Removed 0 from acc_z for squat
Removed 1 from acc_z for dead
Removed 0 from acc_z for row
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
Removed 0 from acc_z for rest
Removed 2 from gyr_x for bench
Removed 4 from gyr_x for ohp
Removed 1 from gyr_x for squat
Removed 6 from gyr_x for dead
Removed 0 from gyr_x for row
Removed 12 from gyr_x for rest
Removed 14 from gyr_y for bench
Removed 15 from gyr_y for ohp
Removed 9 from gyr_y for squat
Removed 14 from gyr_y for dead
Removed 10 from gyr_y for row
Removed 9 from gyr_y for rest
Removed 13 from gyr_z for bench
Removed 1 from gyr_z for ohp
Removed 12 from gyr_z for squat
Removed 14 from gyr_z for dead
Removed 0 from gyr_z for row
Removed 24 from gyr_z for rest
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
<ipython-input-3-b374445ec478>:142: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i]))
In [ ]:
removed_outliers_df
Out[ ]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z label category participant set
epoch (ms)
2019-01-11 15:08:05.200 0.013500 0.977000 -0.071000 -1.8904 2.4392 0.9388 bench heavy B 64
2019-01-11 15:08:05.400 -0.001500 0.970500 -0.079500 -1.6826 -0.8904 2.1708 bench heavy B 64
2019-01-11 15:08:05.600 0.001333 0.971667 -0.064333 2.5608 -0.2560 -1.4146 bench heavy B 64
2019-01-11 15:08:05.800 -0.024000 0.957000 -0.073500 8.0610 -4.5244 -2.0730 bench heavy B 64
2019-01-11 15:08:06.000 -0.028000 0.957667 -0.115000 2.4390 -1.5486 -3.6098 bench heavy B 64
... ... ... ... ... ... ... ... ... ... ...
2019-01-20 17:33:27.000 -0.048000 -1.041500 -0.076500 1.4146 -5.6218 0.2926 row medium E 71
2019-01-20 17:33:27.200 -0.037000 -1.030333 -0.053333 -2.7684 -0.5854 2.2440 row medium E 71
2019-01-20 17:33:27.400 -0.060000 -1.031000 -0.082000 2.8416 -5.1342 -0.1220 row medium E 71
2019-01-20 17:33:27.600 -0.038667 -1.025667 -0.044667 -0.2318 0.2562 1.1220 row medium E 71
2019-01-20 17:33:27.800 -0.044000 -1.034000 -0.059000 1.0980 -4.0240 0.9760 row medium E 71

9009 rows × 10 columns

In [ ]:
removed_outliers_df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9009 entries, 2019-01-11 15:08:05.200000 to 2019-01-20 17:33:27.800000
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   acc_x        9005 non-null   float64
 1   acc_y        8998 non-null   float64
 2   acc_z        8999 non-null   float64
 3   gyr_x        8984 non-null   float64
 4   gyr_y        8938 non-null   float64
 5   gyr_z        8945 non-null   float64
 6   label        9009 non-null   object 
 7   category     9009 non-null   object 
 8   participant  9009 non-null   object 
 9   set          9009 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 774.2+ KB
In [ ]:
removed_outliers_df.to_pickle("../../interim/removed_outliers_chauvenet_02.pkl")
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
/Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py in line 1
----> <a href='file:///Users/bogdanduminica/Desktop/tracking-barbell-exercises/src/features/remove_outliers.py?line=285'>286</a> removed_outliers_df.to_pickle("../../interim/removed_outliers_chauvenet_02.pkl")

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/util/_decorators.py:333, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/util/_decorators.py?line=326'>327</a> if len(args) > num_allow_args:
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/util/_decorators.py?line=327'>328</a>     warnings.warn(
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/util/_decorators.py?line=328'>329</a>         msg.format(arguments=_format_argument_list(allow_args)),
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/util/_decorators.py?line=329'>330</a>         FutureWarning,
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/util/_decorators.py?line=330'>331</a>         stacklevel=find_stack_level(),
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/util/_decorators.py?line=331'>332</a>     )
--> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/util/_decorators.py?line=332'>333</a> return func(*args, **kwargs)

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py:3165, in NDFrame.to_pickle(self, path, compression, protocol, storage_options)
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3114'>3115</a> """
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3115'>3116</a> Pickle (serialize) object to file.
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3116'>3117</a> 
   (...)
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3160'>3161</a> 4    4    9
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3161'>3162</a> """  # noqa: E501
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3162'>3163</a> from pandas.io.pickle import to_pickle
-> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3164'>3165</a> to_pickle(
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3165'>3166</a>     self,
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3166'>3167</a>     path,
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3167'>3168</a>     compression=compression,
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3168'>3169</a>     protocol=protocol,
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3169'>3170</a>     storage_options=storage_options,
   <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py?line=3170'>3171</a> )

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py:103, in to_pickle(obj, filepath_or_buffer, compression, protocol, storage_options)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=99'>100</a> if protocol < 0:
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=100'>101</a>     protocol = pickle.HIGHEST_PROTOCOL
--> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=102'>103</a> with get_handle(
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=103'>104</a>     filepath_or_buffer,
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=104'>105</a>     "wb",
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=105'>106</a>     compression=compression,
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=106'>107</a>     is_text=False,
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=107'>108</a>     storage_options=storage_options,
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=108'>109</a> ) as handles:
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=109'>110</a>     # letting pickle write directly to the buffer is more memory-efficient
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/pickle.py?line=110'>111</a>     pickle.dump(obj, handles.handle, protocol=protocol)

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py:749, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py?line=746'>747</a> # Only for write methods
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py?line=747'>748</a> if "r" not in mode and is_path:
--> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py?line=748'>749</a>     check_parent_directory(str(handle))
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py?line=750'>751</a> if compression:
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py?line=751'>752</a>     if compression != "zstd":
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py?line=752'>753</a>         # compression libraries do not like an explicit text-mode

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py:616, in check_parent_directory(path)
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py?line=613'>614</a> parent = Path(path).parent
    <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py?line=614'>615</a> if not parent.is_dir():
--> <a href='file:///Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/common.py?line=615'>616</a>     raise OSError(rf"Cannot save file into a non-existent directory: '{parent}'")

OSError: Cannot save file into a non-existent directory: '../../interim'
In [ ]:
removed_outliers_df.to_pickle("../../data/interim/removed_outliers_chauvenet_02.pkl")